{
    "text": {
        "dataset": {
            "train": 25000,
            "dev": 25000,
            "test": 25000
        },
        "training": {
            "mixed_precision": false,
            "batch_size": 32,
            "learning_rate": 0.0001,
            "warmup": 8000,
            "lr_decay": "linear",
            "weight_decay": 0,
            "eval_frequency": 500,
            "num_train_steps": 20000,
            "num_eval_steps": 779,
            "gradient_accumulation": 1
        },
        "model": {
            "pooling_mode": "cls",
            "common": {
                "dim_model": 256,
                "num_classes": 2,
                "seq_len": 4096,
                "num_heads": 4,
                "vocab_size": 512,
                "dropout": 0.1
            },
            "xformer": [
                {
                    "reversible": false,
                    "block_type": "encoder",
                    "num_layers": 4,
                    "residual_norm_style": "pre",
                    "position_encoding_config": {
                        "name": "vocab"
                    },
                    "multi_head_config": {
                        "residual_dropout": 0.1,
                        "attention": {
                            "name": "generic",
                            "causal": false
                        }
                    },
                    "feedforward_config": {
                        "name": "MLP",
                        "activation": "gelu",
                        "hidden_layer_multiplier": 4
                    }
                }
            ],
            "extra_settings": {
                "attention": {
                    "favor": {
                        "dim_features": 256,
                        "iter_before_redraw": 1000
                    },
                    "nystrom": {
                        "conv_kernel_size": 35,
                        "num_landmarks": 128
                    }
                }
            }
        }
    },
    "listops": {
        "dataset": {
            "train": 96000,
            "dev": 2000,
            "test": 2000
        },
        "training": {
            "mixed_precision": false,
            "batch_size": 32,
            "learning_rate": 0.0001,
            "warmup": 1000,
            "lr_decay": "linear",
            "weight_decay": 0,
            "eval_frequency": 50,
            "num_train_steps": 10000,
            "num_eval_steps": 62,
            "gradient_accumulation": 2
        },
        "model": {
            "pooling_mode": "mean",
            "common": {
                "dim_model": 64,
                "num_classes": 10,
                "seq_len": 2048,
                "num_heads": 2,
                "vocab_size": 32,
                "dropout": 0.1
            },
            "xformer": [
                {
                    "reversible": false,
                    "block_type": "encoder",
                    "num_layers": 2,
                    "residual_norm_style": "pre",
                    "position_encoding_config": {
                        "name": "vocab"
                    },
                    "multi_head_config": {
                        "residual_dropout": 0.1,
                        "attention": {
                            "name": "generic",
                            "causal": false
                        }
                    },
                    "feedforward_config": {
                        "name": "MLP",
                        "activation": "gelu",
                        "hidden_layer_multiplier": 2
                    }
                }
            ],
            "extra_settings": {
                "attention": {
                    "favor": {
                        "dim_features": 256,
                        "iter_before_redraw": 1000
                    },
                    "nystrom": {
                        "conv_kernel_size": 35,
                        "num_landmarks": 128
                    }
                }
            }
        }
    },
    "retrieval": {
        "dataset": {
            "train": 147086,
            "dev": 18090,
            "test": 17437
        },
        "training": {
            "mixed_precision": false,
            "batch_size": 32,
            "learning_rate": 0.0001,
            "warmup": 800,
            "lr_decay": "linear",
            "weight_decay": 0,
            "eval_frequency": 300,
            "num_train_steps": 30000,
            "num_eval_steps": 565,
            "gradient_accumulation": 2
        },
        "model": {
            "pooling_mode": "mean",
            "common": {
                "dim_model": 64,
                "num_classes": 2,
                "seq_len": 4096,
                "num_heads": 2,
                "vocab_size": 512,
                "dropout": 0.1
            },
            "xformer": [
                {
                    "reversible": true,
                    "block_type": "encoder",
                    "num_layers": 2,
                    "residual_norm_style": "pre",
                    "position_encoding_config": {
                        "name": "vocab"
                    },
                    "multi_head_config": {
                        "residual_dropout": 0.1,
                        "attention": {
                            "name": "generic",
                            "causal": false
                        }
                    },
                    "feedforward_config": {
                        "name": "MLP",
                        "activation": "gelu",
                        "hidden_layer_multiplier": 2
                    }
                }
            ],
            "extra_settings": {
                "attention": {
                    "favor": {
                        "dim_features": 256,
                        "iter_before_redraw": 1000
                    },
                    "nystrom": {
                        "conv_kernel_size": 35,
                        "num_landmarks": 128
                    }
                }
            }
        }
    },
    "pathfinder32": {
        "training": {
            "mixed_precision": false,
            "batch_size": 256,
            "learning_rate": 0.0001,
            "warmup": 312,
            "lr_decay": "linear",
            "weight_decay": 0,
            "eval_frequency": 312,
            "num_train_steps": 62400,
            "num_eval_steps": 312,
            "gradient_accumulation": 1
        },
        "model": {
            "pooling_mode": "mean",
            "common": {
                "dim_model": 64,
                "num_classes": 2,
                "seq_len": 1024,
                "num_heads": 2,
                "vocab_size": 512,
                "dropout": 0.1
            },
            "xformer": [
                {
                    "reversible": true,
                    "block_type": "encoder",
                    "num_layers": 2,
                    "residual_norm_style": "pre",
                    "position_encoding_config": {
                        "name": "vocab"
                    },
                    "multi_head_config": {
                        "residual_dropout": 0.1,
                        "attention": {
                            "name": "generic",
                            "causal": false
                        }
                    },
                    "feedforward_config": {
                        "name": "MLP",
                        "activation": "gelu",
                        "hidden_layer_multiplier": 2
                    }
                }
            ],
            "extra_settings": {
                "attention": {
                    "favor": {
                        "dim_features": 256,
                        "iter_before_redraw": 1000
                    },
                    "nystrom": {
                        "conv_kernel_size": 35,
                        "num_landmarks": 128
                    }
                }
            }
        }
    },
    "image": {
        "dataset": {
            "train": 45000,
            "dev": 5000,
            "test": 10000
        },
        "training": {
            "mixed_precision": false,
            "batch_size": 256,
            "learning_rate": 0.0001,
            "warmup": 175,
            "lr_decay": "linear",
            "weight_decay": 0,
            "eval_frequency": 175,
            "num_train_steps": 35000,
            "num_eval_steps": 20,
            "gradient_accumulation": 1
        },
        "model": {
            "pooling_mode": "mean",
            "common": {
                "dim_model": 64,
                "num_classes": 10,
                "seq_len": 1024,
                "num_heads": 2,
                "vocab_size": 512,
                "dropout": 0.1
            },
            "xformer": [
                {
                    "reversible": true,
                    "block_type": "encoder",
                    "num_layers": 2,
                    "residual_norm_style": "pre",
                    "position_encoding_config": {
                        "name": "vocab"
                    },
                    "multi_head_config": {
                        "residual_dropout": 0.1,
                        "attention": {
                            "name": "generic",
                            "causal": false
                        }
                    },
                    "feedforward_config": {
                        "name": "MLP",
                        "activation": "gelu",
                        "hidden_layer_multiplier": 2
                    }
                }
            ],
            "extra_settings": {
                "attention": {
                    "favor": {
                        "dim_features": 256,
                        "iter_before_redraw": 1000
                    },
                    "nystrom": {
                        "conv_kernel_size": 35,
                        "num_landmarks": 128
                    }
                }
            }
        }
    }
}
